Introduction
Removing Features

Introduction

This IPython notebook illustrates how to remove features from a feature table. First, we need to import the py_entitymatching package and other libraries as follows:



In [1]:

    
# Import py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd









    



/Users/pradap/miniconda3/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

Then, read the (sample) input tables for blocking purposes:



In [3]:

    
# Locate the 'datasets' directory under the py_entitymatching install path
datasets_dir = os.path.join(em.get_install_path(), 'datasets')

# Build the paths of the two input tables; os.path.join inserts the
# platform-specific separator and never produces a doubled separator
path_A = os.path.join(datasets_dir, 'person_table_A.csv')
path_B = os.path.join(datasets_dir, 'person_table_B.csv')



In [4]:

    
# Load table A, then table B, from their CSV files; each read passes
# key='ID' so that 'ID' is set as the key attribute of the table
A, B = (em.read_csv_metadata(csv_path, key='ID') for csv_path in (path_A, path_B))



In [5]:

    
# Generate the feature table used for blocking from input tables A and B
# (the result is a pandas DataFrame, as the type check below shows)
feature_table = em.get_features_for_blocking(A, B)

Removing Features from Feature Table



In [6]:

    
# Inspect the type of the feature table
type(feature_table)









    Out[6]:





pandas.core.frame.DataFrame



In [9]:

    
# Preview the first few rows of the feature table (one row per feature)
feature_table.head()









    Out[9]:






  
    
      
      feature_name
      left_attribute
      right_attribute
      left_attr_tokenizer
      right_attr_tokenizer
      simfunction
      function
      function_source
      is_auto_generated
    
  
  
    
      0
      ID_ID_lev_dist
      ID
      ID
      None
      None
      lev_dist
      <function ID_ID_lev_dist at 0x109a7c048>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      1
      ID_ID_lev_sim
      ID
      ID
      None
      None
      lev_sim
      <function ID_ID_lev_sim at 0x11436a158>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      2
      ID_ID_jar
      ID
      ID
      None
      None
      jaro
      <function ID_ID_jar at 0x11436a1e0>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      3
      ID_ID_jwn
      ID
      ID
      None
      None
      jaro_winkler
      <function ID_ID_jwn at 0x11436a268>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      4
      ID_ID_exm
      ID
      ID
      None
      None
      exact_match
      <function ID_ID_exm at 0x11436a510>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True



In [11]:

    
# Drop the first feature (the row with index label 0).
# errors='ignore' keeps this cell idempotent: re-running it after label 0 has
# already been removed is a no-op instead of raising a KeyError.
feature_table = feature_table.drop(0, errors='ignore')



In [12]:

    
# Preview the first few rows again; the row with index label 0 is gone
feature_table.head()









    Out[12]:






  
    
      
      feature_name
      left_attribute
      right_attribute
      left_attr_tokenizer
      right_attr_tokenizer
      simfunction
      function
      function_source
      is_auto_generated
    
  
  
    
      1
      ID_ID_lev_sim
      ID
      ID
      None
      None
      lev_sim
      <function ID_ID_lev_sim at 0x11436a158>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      2
      ID_ID_jar
      ID
      ID
      None
      None
      jaro
      <function ID_ID_jar at 0x11436a1e0>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      3
      ID_ID_jwn
      ID
      ID
      None
      None
      jaro_winkler
      <function ID_ID_jwn at 0x11436a268>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      4
      ID_ID_exm
      ID
      ID
      None
      None
      exact_match
      <function ID_ID_exm at 0x11436a510>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      5
      ID_ID_jac_qgm_3_qgm_3
      ID
      ID
      qgm_3
      qgm_3
      jaccard
      <function ID_ID_jac_qgm_3_qgm_3 at 0x11436a6a8>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True



In [15]:

    
# Keep only the features whose left attribute is 'name';
# every other feature is removed from the table
feature_table = feature_table.loc[feature_table['left_attribute'] == 'name']



In [14]:

    
# Display the whole feature table after filtering on the left attribute
feature_table









    Out[14]:






  
    
      
      feature_name
      left_attribute
      right_attribute
      left_attr_tokenizer
      right_attr_tokenizer
      simfunction
      function
      function_source
      is_auto_generated
    
  
  
    
      6
      name_name_jac_qgm_3_qgm_3
      name
      name
      qgm_3
      qgm_3
      jaccard
      <function name_name_jac_qgm_3_qgm_3 at 0x11436a730>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      7
      name_name_cos_dlm_dc0_dlm_dc0
      name
      name
      dlm_dc0
      dlm_dc0
      cosine
      <function name_name_cos_dlm_dc0_dlm_dc0 at 0x11436a7b8>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      8
      name_name_jac_dlm_dc0_dlm_dc0
      name
      name
      dlm_dc0
      dlm_dc0
      jaccard
      <function name_name_jac_dlm_dc0_dlm_dc0 at 0x11436a840>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      9
      name_name_mel
      name
      name
      None
      None
      monge_elkan
      <function name_name_mel at 0x11436a8c8>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      10
      name_name_lev_dist
      name
      name
      None
      None
      lev_dist
      <function name_name_lev_dist at 0x11436a950>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      11
      name_name_lev_sim
      name
      name
      None
      None
      lev_sim
      <function name_name_lev_sim at 0x11436a9d8>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      12
      name_name_nmw
      name
      name
      None
      None
      needleman_wunsch
      <function name_name_nmw at 0x11436aa60>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      13
      name_name_sw
      name
      name
      None
      None
      smith_waterman
      <function name_name_sw at 0x11436aae8>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True



In [16]:

    
# Keep only the features whose similarity function is 'jaccard';
# every other feature is removed from the table
feature_table = feature_table.loc[feature_table['simfunction'] == 'jaccard']



In [17]:

    
# Display the whole feature table after filtering on the similarity function
feature_table









    Out[17]:






  
    
      
      feature_name
      left_attribute
      right_attribute
      left_attr_tokenizer
      right_attr_tokenizer
      simfunction
      function
      function_source
      is_auto_generated
    
  
  
    
      6
      name_name_jac_qgm_3_qgm_3
      name
      name
      qgm_3
      qgm_3
      jaccard
      <function name_name_jac_qgm_3_qgm_3 at 0x11436a730>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      8
      name_name_jac_dlm_dc0_dlm_dc0
      name
      name
      dlm_dc0
      dlm_dc0
      jaccard
      <function name_name_jac_dlm_dc0_dlm_dc0 at 0x11436a840>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True

	feature_name	left_attribute	right_attribute	left_attr_tokenizer	right_attr_tokenizer	simfunction	function	function_source	is_auto_generated
0	ID_ID_lev_dist	ID	ID	None	None	lev_dist	<function ID_ID_lev_dist at 0x109a7c048>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
1	ID_ID_lev_sim	ID	ID	None	None	lev_sim	<function ID_ID_lev_sim at 0x11436a158>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
2	ID_ID_jar	ID	ID	None	None	jaro	<function ID_ID_jar at 0x11436a1e0>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
3	ID_ID_jwn	ID	ID	None	None	jaro_winkler	<function ID_ID_jwn at 0x11436a268>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
4	ID_ID_exm	ID	ID	None	None	exact_match	<function ID_ID_exm at 0x11436a510>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True

	feature_name	left_attribute	right_attribute	left_attr_tokenizer	right_attr_tokenizer	simfunction	function	function_source	is_auto_generated
6	name_name_jac_qgm_3_qgm_3	name	name	qgm_3	qgm_3	jaccard	<function name_name_jac_qgm_3_qgm_3 at 0x11436a730>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
7	name_name_cos_dlm_dc0_dlm_dc0	name	name	dlm_dc0	dlm_dc0	cosine	<function name_name_cos_dlm_dc0_dlm_dc0 at 0x11436a7b8>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
8	name_name_jac_dlm_dc0_dlm_dc0	name	name	dlm_dc0	dlm_dc0	jaccard	<function name_name_jac_dlm_dc0_dlm_dc0 at 0x11436a840>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
9	name_name_mel	name	name	None	None	monge_elkan	<function name_name_mel at 0x11436a8c8>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
10	name_name_lev_dist	name	name	None	None	lev_dist	<function name_name_lev_dist at 0x11436a950>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
11	name_name_lev_sim	name	name	None	None	lev_sim	<function name_name_lev_sim at 0x11436a9d8>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
12	name_name_nmw	name	name	None	None	needleman_wunsch	<function name_name_nmw at 0x11436aa60>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
13	name_name_sw	name	name	None	None	smith_waterman	<function name_name_sw at 0x11436aae8>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True

Contents

Introduction

Removing Features from Feature Table